Part 1: stats
#install required package
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("FlowSOM") #for flowsom clustering
BiocManager::install("flowCore") #for flowsom clustering
BiocManager::install("cluster") #for silhouette score
BiocManager::install("fpc") #for ch index
BiocManager::install('clv') #for dbi
BiocManager::install('Seurat') #for pca and flowsom visualization
library(FlowSOM)
library(flowCore)
library(cluster)
library(fpc)
library(clv)
library(Seurat)
library(dplyr)
rm(list=ls())
#input file path, change if needed
fileName <-"/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/9MBO/prepro_outsjan20-9000cells/prepro_outsflowset.csv"
# note: current matrix sample ID have cell index # attached.
df <- read.csv(fileName)
head(df)
print(dim(df)) # this is specific df has 73578 cells
# the preprocessing output csv needs to be cleaned - it contains live dead, FSC, SSC and the sample column
df2 <- df %>% select(-c("Live.Dead",FSC,SSC,X,Batch,cell))
m <- as.matrix(df2) # flowset takes ina matrix not df
#m <- om[1:3000,] #subset (n=3000), omit this to test the whole file - I didn't subset here but too 9000 or max cells from each file before
#try scaling
# SOM = self organizing map, MST = minimal spanning tree
# if reading in a csv convert to flowset
frame <- new("flowFrame", exprs = m) #convert input to flowframe
fs <- ReadInput(frame) #convert flowframe to flowsom object
# fs <- BuildSOM(fs,colsToUse=(-1)) #-1 because we are not using "X" column to build SOM
fs <- BuildSOM(fs) # build flowSOM object, no need for -1 because I cleaned the df about before making flowset
fs <- BuildMST(fs) # build minimum spanning tree
# BuildMST(flowSOM object generated by buildSOM)
# save the FlowSOM MST object to try and avoid memory error
saveRDS(fs,"/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/9MBO/prepro_outsjan20-9000cells/FlowSOMin.rds")
Saved the flowset object I created and load the object here - but I didn’t save the filtered df converted to matrix, which is needed for the stats below
fs <- readRDS("/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/9MBO/prepro_outsjan20-9000cells/FlowSOMin.rds")
# note to Shuming - we should be able to go directly from flowset in preprocessing into this flowset but then the 9 samples (or however many will be in separate frames)
# I don't know where the actual values are located
# skip
#based on flowsom, the optimal k number of clusters = 5 for the sample (n=3000)
metacl <- MetaClustering(fs$map$codes,
"metaClustering_consensus",
max = 15)
unique(metacl[fs$map$mapping[, 1]])
#a function that calculate 3 stats for k number of clustering
# the silhouette stat takes too much memory and we will not run this for now.
# Shuming I need this function to take in a list of clustering indexes and an expression matrix
stats <- function(krange){
# si_li <-list()
ch_li <-list()
dbi_li <-list()
for (k in krange) {
#flowsom clustering, try each k in krange
metaClustering <- (metaClustering_consensus(fs$map$codes,k = k,seed=42)) # here we are clustering within the loop
# @Shuming we need to change this so that the loop can pull the results of clustering out of another data object
# how am I going to test the Seurat or Phenograph clustering?
#calculate silhouette score
# cluster index, distance matrix of pairs --- maybe calculate in advance
#si <- silhouette(metaClustering[fs$map$mapping[, 1]],dist(m),)
#si_li[k] <- mean(si[, 3])
#calculate Calinski-Harabasz index
# calinhara(x,clustering,cn=max(clustering))
# where x is a matrix or dataframe
ch = calinhara(m,metaClustering[fs$map$mapping[, 1]],cn=max(metaClustering[fs$map$mapping[,1]]))
ch_li[k] <- ch
#calculate Davies–Bouldin index
dbi = clv.Davies.Bouldin(cls.scatt.data(
m,
metaClustering[fs$map$mapping[, 1]]),
intracls = "average",
intercls = "average"
)
dbi_li[k] <- dbi[1]
}
return(list(ch_li, dbi_li))
}
# run the function to get the stats
krange <- 3:15 #range of number of clusters
li = stats(krange) # doesn't contain silhouette
# list of 2
#silhouette score: ranges from -1 to 1
#-1: bad clusters 0: neutral, indifferent 1: good clusters
#plot(krange, type='b', li[[1]][krange], xlab='Number of clusters', ylab='Average Silhouette Scores', frame=TRUE)
#Calinski-Harabasz index:
# the highest value is the optimal number of clusters
plot(krange, type='b', li[[1]][krange], xlab='Number of clusters', ylab='Calinski-Harabasz index', frame=TRUE)
#Davies–Bouldin index: minimum score is zero
#the lowest value is the optimal number of clusters
plot(krange, type='b', li[[2]][krange], xlab='Number of clusters', ylab='Davies–Bouldin index', frame=TRUE)
The Calinski-Harabasz index show that K=8 is best number of clusters. But the Davies–Bouldin index shows lower is better. It is okay up until 9.
Try statistics on other clustering.
ch.pheno.k.50 = calinhara(m,df$phenograph,length((unique(df$phenograph))))
Error in if (cln[i] < 2) cclx <- 0 : argument is of length zero
David Bouldin index for Phenograph
make this calculations on the Seurat clustering
# read in the seurat object with the clustering values
seu <- readRDS("/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/9MBO/prepro_outsjan20-9000cells/SeuratfromFlowsomPheno.rds" )
CH index and DBI from Seurat (Louvain) try one resolution first
dbi.louv.res.1 <- clv.Davies.Bouldin(cls.scatt.data(
m,
seu@meta.data$RNA_snn_res.1),
intracls = "average",
intercls = "average"
)
Error in cls.id.vect.validity(clust, "clust") :
Bad usage: input 'clust' should be vector type.
NA
Calculate all CHI for Louvain
ch_louv <-list()
l.range <- c("RNA_snn_res.0.1","RNA_snn_res.0.25","RNA_snn_res.0.5","RNA_snn_res.0.75","RNA_snn_res.1")
resolution = c(0.1,0.25,0.5,0.75,1)
prefix = "RNA_snn_res."
# loop doesn't work because I can't index the seurat resolutions
ch = calinhara(m,seu@meta.data$RNA_snn_res.0.1,length((unique(seu@meta.data$RNA_snn_res.0.1))))
ch_louv["RNA_snn_res.0.1"] <- ch
ch = calinhara(m,seu@meta.data$RNA_snn_res.0.25,length((unique(seu@meta.data$RNA_snn_res.0.25))))
ch_louv["RNA_snn_res.0.25"] <- ch
ch = calinhara(m,seu@meta.data$RNA_snn_res.0.5,length((unique(seu@meta.data$RNA_snn_res.0.5))))
ch_louv["RNA_snn_res.0.5"] <- ch
ch = calinhara(m,seu@meta.data$RNA_snn_res.0.75,length((unique(seu@meta.data$RNA_snn_res.0.75))))
ch_louv["RNA_snn_res.0.75"] <- ch
ch = calinhara(m,seu@meta.data$RNA_snn_res.1,length((unique(seu@meta.data$RNA_snn_res.1))))
ch_louv["RNA_snn_res.1"] <- ch
Plot the CHI from Louvain clusters

Make a table with the CHI outputs
df.chi <- as.data.frame(Method, CHI.value)
Warning in as.data.frame.vector(x, ..., nm = nm) :
'row.names' is not a character vector of length 7 -- omitting it. Will be an error!
Make another plot
#library(ggplot2)
ggplot(df.chi, aes(x= Method, y= CHI.value)) + geom_point() +
theme(axis.text.x = element_text(angle = 90))

Part 2: plot
#transpose the csv so that seurat object has the right column and row
# the flow intensity values will be input as RNA expression
tm <- t(df2)
rownames(tm) <- colnames(df2)
colnames(tm) <- rownames(df2)
k <- 10 #change number of cluster here
k <- 8 # best number of clusters according to CH index
metaClustering <- (metaClustering_consensus(fs$map$codes,k = k,seed=42)) # flowSOM clustering?
seu <- CreateSeuratObject(tm) # create a seurat object
# check the seurat object by plotting some features
print(colnames(df2))
allAB <- colnames(df2)
VlnPlot(seu,features = c("CD56","AQP4")) # eerror in plot
DotPlot(seu, features = c("CD56","AQP4","CD24","GLAST","CD140a","CD29","CD184","CD71","O4","HepaCAM","CD133"))
# scale the data
seu <- ScaleData(seu)
DotPlot(seu, features = c("CD56","AQP4","CD24","GLAST","CD140a","CD29","CD184","CD71","O4","HepaCAM","CD133"))
DotPlot(seu, features = allAB)
#do i need to normalize?
# seu <- NormalizeData(seu, normalization.method = "LogNormalize", scale.factor = 10000)
# create a UMAP
# to do so PCA must be run first
seu <- FindVariableFeatures(seu) # needed for PCA or select all features
#dimentionality reduction methods: pca and umap
seu <-RunPCA(seu,seed.use = 42) # will use variable features by default
seu <- RunPCA(seu, features = allAB)
print(seu[["pca"]], dims = 1:5, nfeatures = 5) # tells you what AB are contributing to the
seu <- RunUMAP(seu,features = allAB)
# Add the clustering data to the seurat object
seu <- AddMetaData(object=seu, metadata=metaClustering[fs$map$mapping[,1]], col.name = 'flowSOM.k.8')
DimPlot(seu, group.by = "flowSOM", reduction = "pca")
DimPlot(seu, group.by = "flowSOM", reduction = "umap") # from normal PCA used as umap input it is mostly on blob, I get the same shape when using all AB features as PCA input
DotPlot (seu, features = allAB, group.by = "flowSOM")
#allow for color labeling
Idents(seu) <- "flowSOM"
FLowSOM clustering visually appear to be terrible.
I’ll just try the seurat clustering
# cluster using Louvain clustering
# using the square root of the number of cells for K might work better
#sqrt(73578) = 271.25
seu <- FindNeighbors(seu, dims = 1:10, k = 271)
seu <- FindClusters(seu, resolution = c(0.1,0.25,0.5,0.75,1))
clustree(seu, prefix = "RNA_snn_res.") + theme(legend.position = "bottom")
# this is very slow - do not run again in notebook
library(clustree)
clustree(seu, prefix = "RNA_snn_res.") + theme(legend.position = "bottom")

# from Clustree 0.5 looks like the best resolution
# c(0.1,0.25,0.5,0.75,1)
resolutions = c("RNA_snn_res.0.1","RNA_snn_res.0.25","RNA_snn_res.0.5","RNA_snn_res.0.75","RNA_snn_res.1")
for (res in resolutions){
print(DimPlot(seu, reduction = "umap", repel = TRUE, label = TRUE, group.by = res))
}





NA
NA
The clusters are not well separated, but better than with flowSOM (visually) Lets see how a dotplot looks
DotPlot(seu, group.by = "RNA_snn_res.0.5", features = allAB, scale = TRUE) # why is the intensity ploted?
# the data must not be working correctly
# maybe the object is not really correct at all
FeatureScatter(seu, feature1 = "CD56",feature2 = "CD24")
FeatureScatter(seu, feature1 = "CD29",feature2 = "CD184")
# maybe the data needs to be normalized
seu <- NormalizeData(seu)
DoHeatmap(seu, features = allAB, group.by = "RNA_snn_res.0.5")
# the groups look okay by heatmap. I'll try the different resolutions
# c(0.1,0.25,0.5,0.75,1)
resolutions = c("RNA_snn_res.0.1","RNA_snn_res.0.25","RNA_snn_res.0.5","RNA_snn_res.0.75","RNA_snn_res.1")
for (res in resolutions){
print(DoHeatmap(seu, features = allAB, group.by = res))
}





# try the dotplot again
# c(0.1,0.25,0.5,0.75,1)
resolutions = c("RNA_snn_res.0.1","RNA_snn_res.0.25","RNA_snn_res.0.5","RNA_snn_res.0.75","RNA_snn_res.1")
for (res in resolutions){
print(DotPlot(seu, features = allAB, group.by = res, cols = c("blue","red")))
}
# what is happening?
Try feature maps
for (AB in allAB){
print(FeaturePlot(seu, features = AB),slot = 'scale.data',min.cutoff = 'q10', max.cutoff ='90')
}













# autothreshold isn't great - I set quartiles
#FeaturePlot(seu, features = "O4", min.cutoff = 1, max.cutoff = 25)
#FeaturePlot(seu, features = "O4", slot = 'scale.data',min.cutoff = 0.1, max.cutoff = 25)
#FeaturePlot(seu, features = "O4", slot = 'scale.data',min.cutoff = 'q5', max.cutoff ='99')
Try ridge plots
RidgePlot(seu, features = c("CD56","CD24"), ncol = 2) # missing values???
VlnPlot(seu, features = c("CD56","CD24"), ncol = 2) # also missing values???
I’ll save the seurat object and see about trying to cluster with phenograph. I believe I have read in the unalligned not normalized data
saveRDS(seu, "/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/9MBO/prepro_outsjan20-9000cells/SeuratfromFlowsom.rds")
Phenograph clustering uses Jaccard coefficient between neruest neigbors sets and then id communities by louvain
# install
if(!require(devtools)){
install.packages("devtools") # If not already installed
}
devtools::install_github("JinmiaoChenLab/Rphenograph")
library(Rphenograph)
---
title: "stats: silhouette score, ch, dbi"
output: html_notebook
---

Part 1: stats
```{r}

#install required package
if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install("FlowSOM") #for flowsom clustering
BiocManager::install("flowCore") #for flowsom clustering
BiocManager::install("cluster") #for silhouette score
BiocManager::install("fpc") #for ch index
BiocManager::install('clv') #for dbi

BiocManager::install('Seurat') #for pca and flowsom visualization
```

```{r}
library(FlowSOM)
library(flowCore)
library(cluster)
library(fpc)
library(clv)
library(Seurat)
library(dplyr)
rm(list=ls())

```


```{r}

#input file path, change if needed
fileName <-"/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/9MBO/prepro_outsjan20-9000cells/prepro_outsflowset.csv"

# note: current matrix sample ID have cell index # attached. 

df <- read.csv(fileName)
head(df)
print(dim(df)) # this is specific df has 73578 cells
# the preprocessing output csv needs to be cleaned - it contains live dead, FSC, SSC and the sample column
df2 <- df %>% select(-c("Live.Dead",FSC,SSC,X,Batch,cell))

m <- as.matrix(df2) # flowset takes ina matrix not df 
#m <- om[1:3000,] #subset (n=3000), omit this to test the whole file  - I didn't subset here but too 9000 or max cells from each file before

#try scaling
# SOM = self organizing map, MST = minimal spanning tree

# if reading in a csv convert to flowset
frame <- new("flowFrame", exprs = m) #convert input to flowframe
fs <- ReadInput(frame) #convert flowframe to flowsom object
# fs <- BuildSOM(fs,colsToUse=(-1)) #-1 because we are not using "X" column to build SOM
fs <- BuildSOM(fs) # build flowSOM object, no need for -1 because I cleaned the df about before making flowset 
fs <- BuildMST(fs) # build minimum spanning tree 
# BuildMST(flowSOM object generated by buildSOM)

# save the FlowSOM MST object to try and avoid memory error
saveRDS(fs,"/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/9MBO/prepro_outsjan20-9000cells/FlowSOMin.rds")


```

Saved the flowset object I created and load the object here - but I didn't save the filtered df converted to matrix, which is needed for the stats below

```{r}
fs <- readRDS("/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/9MBO/prepro_outsjan20-9000cells/FlowSOMin.rds")

# note to Shuming - we should be able to go directly from flowset in preprocessing into this flowset but then the 9 samples (or however many will be in separate frames)
# I don't know where the actual values are located

```



```{r}
# skip
#based on flowsom, the optimal k number of clusters = 5 for the sample (n=3000) 
metacl <- MetaClustering(fs$map$codes,
"metaClustering_consensus",
max = 15)

unique(metacl[fs$map$mapping[, 1]])
```





```{r}

#a function that calculate 3 stats for k number of clustering
# the silhouette stat takes too much memory and we will not run this for now.

# Shuming I need this function to take in a list of clustering indexes and an expression matrix

stats <- function(krange){
 # si_li <-list()
  ch_li <-list()
  dbi_li <-list()
  
  for (k in krange) {
    
    #flowsom clustering, try each k in krange
    metaClustering <- (metaClustering_consensus(fs$map$codes,k = k,seed=42)) # here we are clustering within the loop
    # @Shuming we need to change this so that the loop can pull the results of clustering out of another data object
    # how am I going to test the Seurat or Phenograph clustering?

    #calculate silhouette score
    # cluster index, distance matrix of pairs ---  maybe calculate in advance 
    #si <- silhouette(metaClustering[fs$map$mapping[, 1]],dist(m),)

    #si_li[k] <- mean(si[, 3])
    
    #calculate Calinski-Harabasz index
    # calinhara(x,clustering,cn=max(clustering))
    # where x is a matrix or dataframe
    ch = calinhara(m,metaClustering[fs$map$mapping[, 1]],cn=max(metaClustering[fs$map$mapping[,1]]))
    
    ch_li[k] <- ch
    
    #calculate Davies–Bouldin index
    dbi = clv.Davies.Bouldin(cls.scatt.data(
      m,
      metaClustering[fs$map$mapping[, 1]]),
      intracls = "average",
      intercls = "average"
    )

    dbi_li[k] <- dbi[1]
    } 

  return(list(ch_li, dbi_li))
}



```


```{r}
# run the function to get the stats



krange <- 3:15 #range of number of clusters
li = stats(krange)  # doesn't contain silhouette 
# list of 2 

#silhouette score: ranges from -1  to 1 
#-1: bad clusters  0: neutral, indifferent  1: good clusters
#plot(krange, type='b', li[[1]][krange], xlab='Number of clusters', ylab='Average Silhouette Scores', frame=TRUE)

#Calinski-Harabasz index: 
# the highest value is the optimal number of clusters
plot(krange, type='b', li[[1]][krange], xlab='Number of clusters', ylab='Calinski-Harabasz index', frame=TRUE)

#Davies–Bouldin index: minimum score is zero
#the lowest value is the optimal number of clusters
plot(krange, type='b', li[[2]][krange], xlab='Number of clusters', ylab='Davies–Bouldin index', frame=TRUE)


```
The Calinski-Harabasz index show that K=8 is best number of clusters.  But the Davies–Bouldin index shows lower is better.  It is okay up until 9. 


Try statistics on other clustering.  


```{r}
# from Phenograph 
# read in the df with phenograph clusters and saved expression matrix output from preprocessing - no alignement or normalization
print(colnames(df))
allAB <- c("AQP4","CD56","GLAST", "CD140a","CD29", "CD44", "CD184","CD71","CD24","CD15","O4","HepaCAM","CD133")
# we need just the expression matrix 
df2 <- df %>% select(c("AQP4","CD56","GLAST", "CD140a","CD29", "CD44", "CD184","CD71","CD24","CD15","O4","HepaCAM","CD133"))

m <- as.matrix(df2)


#calculate Calinski-Harabasz index
    # calinhara(x,clustering,cn=max(clustering))
    # where x is a matrix or dataframe
ch.pheno.k.271 = calinhara(m,df$phenograph_clusterk271,length((unique(df$phenograph_clusterk271))))

ch.pheno.k.50 = calinhara(m,df$phenograph_cluster,length((unique(df$phenograph_cluster))))   

# this works well to see the statistics but not great for a plot.
# I might need to just make a tabel




```

David Bouldin index for Phenograph 

```{r}
#calculate Davies–Bouldin index
# indiex.list, intracls, intercls 
# I'm not sure how this is working but it does run
dbi.phen.k.271 <- clv.Davies.Bouldin(cls.scatt.data(
      m,
      df$phenograph_clusterk271),
      intracls = "average",
      intercls = "average"
    )
  
dbi.phen.k.50 <- clv.Davies.Bouldin(cls.scatt.data(
      m,
      df$phenograph_cluster),
      intracls = "average",
      intercls = "average"
    )


```


make this calculations on the Seurat clustering


```{r}
# read in the seurat object with the clustering values
seu <- readRDS("/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/9MBO/prepro_outsjan20-9000cells/SeuratfromFlowsomPheno.rds" )


```

CH index and DBI from Seurat (Louvain) try one resolution first

```{r}

ch.louv.res.1 = calinhara(m,seu@meta.data$RNA_snn_res.1,length((unique(seu@meta.data$RNA_snn_res.1))))
# this works

dbi.louv.res.1 <- clv.Davies.Bouldin(cls.scatt.data(
      m,
      seu@meta.data$RNA_snn_res.1),
      intracls = "average",
      intercls = "average"
    )
# this doesn't work - not sure where to specify how to find the data to calculate intracls and intercls


```

Calculate all CHI for Louvain

```{r}

  ch_louv <-list()
l.range <- c("RNA_snn_res.0.1","RNA_snn_res.0.25","RNA_snn_res.0.5","RNA_snn_res.0.75","RNA_snn_res.1")
resolution = c(0.1,0.25,0.5,0.75,1)
prefix = "RNA_snn_res." 
# loop doesn't work because I can't index the seurat resolutions

ch = calinhara(m,seu@meta.data$RNA_snn_res.0.1,length((unique(seu@meta.data$RNA_snn_res.0.1))))
ch_louv["RNA_snn_res.0.1"] <- ch

ch = calinhara(m,seu@meta.data$RNA_snn_res.0.25,length((unique(seu@meta.data$RNA_snn_res.0.25))))
ch_louv["RNA_snn_res.0.25"] <- ch

ch = calinhara(m,seu@meta.data$RNA_snn_res.0.5,length((unique(seu@meta.data$RNA_snn_res.0.5))))
ch_louv["RNA_snn_res.0.5"] <- ch

ch = calinhara(m,seu@meta.data$RNA_snn_res.0.75,length((unique(seu@meta.data$RNA_snn_res.0.75))))
ch_louv["RNA_snn_res.0.75"] <- ch

ch = calinhara(m,seu@meta.data$RNA_snn_res.1,length((unique(seu@meta.data$RNA_snn_res.1))))
ch_louv["RNA_snn_res.1"] <- ch




```

Plot the CHI from Louvain clusters
```{r}

# the highest value is the optimal number of clusters
y <- sapply(ch_louv, '[', )
plot(resolution, y, xlab='Resolution of clusters',type = "b", ylab='Calinski-Harabasz index', frame=TRUE)




```

Make a table with the CHI outputs

```{r}

Cluster.type <- c("Phenograph.k.50","Phenograph.k.271",l.range)
print(Cluster.type)
Method <- c("Phenograph.k.50" , "Phenograph.k.271" ,"RNA_snn_res.0.1",  "RNA_snn_res.0.25" ,"RNA_snn_res.0.5" , "RNA_snn_res.0.75", "RNA_snn_res.1")

print(y)

CHI.value <- c(7418.299,7806.130, 52521.120,  9930.235,  7734.065,  4949.593,  4229.252)



df.chi <- cbind.data.frame(Method, CHI.value)




```

Make another plot

```{r}
#library(ggplot2)
ggplot(df.chi, aes(x= Method, y= CHI.value)) + geom_point() +
  theme(axis.text.x = element_text(angle = 90))

# how do these values compare with 

```




Part 2: plot

```{r}

#transpose the csv so that seurat object has the right column and row
# the flow intensity values will be input as RNA expression
tm <- t(df2)
rownames(tm) <- colnames(df2)
colnames(tm) <- rownames(df2)


k <- 10 #change number of cluster here 
k <- 8 # best number of clusters according to CH index

metaClustering <- (metaClustering_consensus(fs$map$codes,k = k,seed=42)) # flowSOM clustering?

seu <- CreateSeuratObject(tm) # create a seurat object 
# check the seurat object by plotting some features
print(colnames(df2))
allAB <- colnames(df2)
VlnPlot(seu,features = c("CD56","AQP4")) # eerror in plot
DotPlot(seu, features = c("CD56","AQP4","CD24","GLAST","CD140a","CD29","CD184","CD71","O4","HepaCAM","CD133"))
# scale the data 
seu <- ScaleData(seu)

DotPlot(seu, features = c("CD56","AQP4","CD24","GLAST","CD140a","CD29","CD184","CD71","O4","HepaCAM","CD133"))
DotPlot(seu, features = allAB)
#do i need to normalize?
# seu <- NormalizeData(seu, normalization.method = "LogNormalize", scale.factor = 10000)

# create a UMAP
# to do so PCA must be run first 
seu <- FindVariableFeatures(seu) # needed for PCA or select all features
#dimentionality reduction methods: pca and umap

seu <-RunPCA(seu,seed.use = 42) # will use variable features by default
seu <- RunPCA(seu, features = allAB)

print(seu[["pca"]], dims = 1:5, nfeatures = 5) # tells you what AB are contributing to the 
seu <- RunUMAP(seu,features = allAB)

# Add the clustering data to the seurat object
seu <- AddMetaData(object=seu, metadata=metaClustering[fs$map$mapping[,1]], col.name = 'flowSOM.k.8')

DimPlot(seu, group.by = "flowSOM", reduction = "pca") 
DimPlot(seu, group.by = "flowSOM", reduction = "umap") # from normal PCA used as umap input it is mostly on blob, I get the same shape when using all AB features as PCA input

DotPlot (seu, features = allAB, group.by = "flowSOM")


#allow for color labeling
Idents(seu) <- "flowSOM"
```

FLowSOM clustering visually appear to be terrible.

I'll just try the seurat clustering

```{r}

# cluster using Louvain clustering
# using the square root of the number of cells for K might work better
#sqrt(73578) = 271.25
seu <- FindNeighbors(seu, dims = 1:10, k = 271)
seu <- FindClusters(seu, resolution = c(0.1,0.25,0.5,0.75,1))
clustree(seu, prefix = "RNA_snn_res.") + theme(legend.position = "bottom")
# this is  very slow - do not run again in notebook

```

```{r}
library(clustree)
clustree(seu, prefix = "RNA_snn_res.") + theme(legend.position = "bottom")
```

```{r}
# from Clustree 0.5 looks like the best resolution

# c(0.1,0.25,0.5,0.75,1)
resolutions = c("RNA_snn_res.0.1","RNA_snn_res.0.25","RNA_snn_res.0.5","RNA_snn_res.0.75","RNA_snn_res.1")

for (res in resolutions){
 print(DimPlot(seu, reduction = "umap", repel = TRUE, label = TRUE, group.by = res))
 
}


```

The clusters are not well separated, but better than with flowSOM (visually)
Lets see how a dotplot looks

```{r}
DotPlot(seu, group.by = "RNA_snn_res.0.5", features = allAB, scale = TRUE) # why is the intensity ploted?
# the data must not be working correctly


# maybe the object is not really correct at all
FeatureScatter(seu, feature1 = "CD56",feature2 = "CD24")
FeatureScatter(seu, feature1 = "CD29",feature2 = "CD184")

# maybe the data needs to be normalized
seu <- NormalizeData(seu)

DoHeatmap(seu, features = allAB, group.by = "RNA_snn_res.0.5")
# the groups look okay by heatmap. I'll try the different resolutions 



```

```{r}
# c(0.1,0.25,0.5,0.75,1)
resolutions = c("RNA_snn_res.0.1","RNA_snn_res.0.25","RNA_snn_res.0.5","RNA_snn_res.0.75","RNA_snn_res.1")

for (res in resolutions){
 print(DoHeatmap(seu, features = allAB, group.by = res))
 
}

```

```{r}
# try the dotplot again
# c(0.1,0.25,0.5,0.75,1)
resolutions = c("RNA_snn_res.0.1","RNA_snn_res.0.25","RNA_snn_res.0.5","RNA_snn_res.0.75","RNA_snn_res.1")

for (res in resolutions){
 print(DotPlot(seu, features = allAB, group.by = res, cols = c("blue","red")))
 
}

# what is happening?

```

Try feature maps

```{r}


for (AB in allAB){
 print(FeaturePlot(seu, features = AB),slot = 'scale.data',min.cutoff = 'q10', max.cutoff ='90')
 
}

# autothreshold isn't great - I set quartiles 

#FeaturePlot(seu, features = "O4", min.cutoff = 1, max.cutoff = 25)
#FeaturePlot(seu, features = "O4", slot = 'scale.data',min.cutoff = 0.1, max.cutoff = 25)

#FeaturePlot(seu, features = "O4", slot = 'scale.data',min.cutoff = 'q5', max.cutoff ='99')




```

Try ridge plots

```{r}
RidgePlot(seu, features = c("CD56","CD24"), ncol = 2) # missing values???
VlnPlot(seu, features = c("CD56","CD24"), ncol = 2) # also missing values???

```


I'll save the seurat object and see about trying to cluster with phenograph.  I believe I have read in the unalligned not normalized data 


```{r}
saveRDS(seu, "/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/9MBO/prepro_outsjan20-9000cells/SeuratfromFlowsom.rds")



```


Phenograph clustering uses Jaccard coefficient between neruest neigbors sets and then id communities by louvain 


```{r}
# install
if(!require(devtools)){
  install.packages("devtools") # If not already installed
}
devtools::install_github("JinmiaoChenLab/Rphenograph")

library(Rphenograph)

```




